library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
library(ggrepel)
library(showtext)
## Loading required package: sysfonts
## Loading required package: showtextdb
font_add_google("Lato", "lato")
showtext_auto()
babynames <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-03-22/babynames.csv')
## Rows: 1924665 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sex, name
## dbl (3): year, n, prop
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
babynames_letter <- babynames %>%
mutate(name = tolower(name)) %>%
group_by(year, sex, name, n) %>%
summarise(letter = unlist(str_split(name, '')))
## `summarise()` has grouped output by 'year', 'sex', 'name', 'n'. You can override using the `.groups` argument.
babynames_letter %>%
group_by(sex, letter) %>%
summarise(
n = sum(n)
) %>%
group_by(sex) %>%
mutate(
prop = n/sum(n)
) %>%
ggplot() +
geom_col(aes(letter, prop, fill = sex), position = 'dodge')
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.

plot_dat <- babynames_letter %>%
mutate(sex = ifelse(sex == 'F', 'Female', 'Male')) %>%
group_by(year, sex, letter) %>%
summarise(
n = sum(n)
) %>%
group_by(year, sex) %>%
mutate(
prop = n/sum(n)
) %>%
filter((letter %in% c('a', 'e', 'i', 'o', 'u')))
## `summarise()` has grouped output by 'year', 'sex'. You can override using the `.groups` argument.
lab_let <- plot_dat %>%
group_by(sex, letter) %>%
filter(year == max(year) | year == min(year)) %>%
mutate(year = ifelse(year == max(year), year+2, year-2))
plot_dat %>%
ggplot() +
geom_line(aes(year, prop, color = sex, group = interaction(sex, letter))) +
geom_text(aes(year, prop, color = sex, group = letter, label = letter),
data = lab_let) +
scale_y_sqrt() +
theme_few()

set.seed(1990)
plot_dat <- babynames_letter %>%
mutate(sex = ifelse(sex == 'F', 'Female', 'Male')) %>%
group_by(year, sex, letter) %>%
summarise(
n = sum(n)
) %>%
group_by(year, sex) %>%
mutate(
prop = n/sum(n)
) %>%
# filter(!(letter %in% c('a', 'e', 'i', 'o', 'u'))) %>%
mutate(letter = letter %>% toupper()) %>%
ungroup() %>%
mutate(letter = factor(letter, sample(LETTERS)))
## `summarise()` has grouped output by 'year', 'sex'. You can override using the `.groups` argument.
lab_let <- plot_dat %>%
group_by(sex, letter) %>%
filter(year == max(year) | year == min(year)) %>%
mutate(year = ifelse(year == max(year), year, year))
plot_dat %>%
ggplot() +
geom_line(aes(year, prop, color = letter,
group = interaction(sex, letter))) +
geom_text_repel(aes(year, prop, color = letter, label = letter),
data = filter(lab_let, year == max(year)),
direction = "y", hjust = "left", nudge_x = 30,
max.overlaps = Inf, min.segment.length = 0,
segment.color = 'black', seed = 0, size = 4) +
geom_text_repel(aes(year, prop, color = letter, label = letter),
data = filter(lab_let, year == min(year)),
direction = "y", hjust = "left", nudge_x = -30,
max.overlaps = Inf, min.segment.length = 0,
segment.color = 'black', seed = 1, size = 4) +
scale_y_sqrt(
breaks = c(0.001, 0.007, seq(0.02, 0.2, 0.02)),
sec.axis = dup_axis()
) +
scale_x_continuous(
expand = expansion(mult = 0.3),
breaks = seq(1880, 2020, 20)
) +
theme_few() +
facet_wrap(~sex) +
theme(
legend.position = 'none'
)+
ylab('Proportion') +
xlab('Year') +
labs(
title = 'The popularity of the letters used in newborn baby names has changed over the years',
caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
)+
theme(
text = element_text(family = 'lato'),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5)
)

plt_fun <- function(highlight) {
plot_dat %>%
ggplot() +
geom_line(aes(year, prop, color = letter,
size = letter %in% highlight,
alpha = letter %in% highlight,
group = interaction(sex, letter))) +
geom_text_repel(aes(year, prop, color = letter, label = letter,
alpha = letter %in% highlight),
data = filter(lab_let, year == max(year)),
direction = "y", hjust = "left", nudge_x = 30,
max.overlaps = Inf, min.segment.length = 0,
segment.color = 'black', seed = 0, size = 4
) +
geom_text_repel(aes(year, prop, color = letter, label = letter,
alpha = letter %in% highlight),
data = filter(lab_let, year == min(year)),
direction = "y", hjust = "left", nudge_x = -30,
max.overlaps = Inf, min.segment.length = 0,
segment.color = 'black', seed = 1, size = 4) +
scale_y_sqrt(
breaks = c(0, 0.001, 0.007, seq(0.02, 0.2, 0.02)),
sec.axis = dup_axis()
) +
scale_x_continuous(
expand = expansion(mult = 0.3),
breaks = seq(1880, 2020, 20)
) +
theme_few() +
facet_wrap(~sex) +
theme(
legend.position = 'none'
) +
scale_size_discrete(range = c(0.5, 1)) +
scale_alpha_discrete(range = c(0.3, 1)) +
ylab('Proportion') +
xlab('Year')
}
plt_fun(c('A', 'E', 'I', 'O', 'U')) +
labs(
title = 'The vowels',
caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
) +
theme(
text = element_text(family = 'lato'),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5)
)
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

babynames %>%
filter(year == '1960', sex == 'M', str_detect(tolower(name), 'f')) %>%
mutate(sum = sum(prop))
## # A tibble: 263 × 6
## year sex name n prop sum
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1960 M Jeffrey 28831 0.0133 0.0418
## 2 1960 M Frank 10759 0.00497 0.0418
## 3 1960 M Jeff 8509 0.00393 0.0418
## 4 1960 M Jeffery 7656 0.00354 0.0418
## 5 1960 M Frederick 3478 0.00161 0.0418
## 6 1960 M Fred 3398 0.00157 0.0418
## 7 1960 M Clifford 2465 0.00114 0.0418
## 8 1960 M Francis 2421 0.00112 0.0418
## 9 1960 M Alfred 2405 0.00111 0.0418
## 10 1960 M Franklin 1534 0.000708 0.0418
## # … with 253 more rows
babynames %>%
filter(year == '1960', sex == 'M')
## # A tibble: 4,590 × 5
## year sex name n prop
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 1960 M David 85928 0.0397
## 2 1960 M Michael 84183 0.0389
## 3 1960 M James 76842 0.0355
## 4 1960 M John 76096 0.0351
## 5 1960 M Robert 72369 0.0334
## 6 1960 M Mark 58731 0.0271
## 7 1960 M William 49354 0.0228
## 8 1960 M Richard 43561 0.0201
## 9 1960 M Thomas 39279 0.0181
## 10 1960 M Steven 33895 0.0157
## # … with 4,580 more rows
plt_fun(c('F', 'S', 'O')) +
labs(
title = 'The rise and fall of letters',
caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
) +
theme(
text = element_text(family = 'lato'),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5)
)
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

plt_fun(c('R', 'W')) +
labs(
title = 'Decreasing trends',
caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
) +
theme(
text = element_text(family = 'lato'),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5)
)
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

plt_fun(c('K', 'X', 'N')) +
labs(
title = 'The newbies',
caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
) +
theme(
text = element_text(family = 'lato'),
axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5)
)
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.
